# -*- coding: utf-8 -*-

import random
import requests
import time
from bs4 import BeautifulSoup
import re

# Browser-like User-Agent so the target site serves regular desktop pages.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)'
headers = {'User-Agent': user_agent}
# One shared session reuses the TCP connection (and cookies) across requests.
session = requests.session()

# Output directory for generated .md files and logs — must already exist.
localDir ="C:\\test\\files\\"

def getAndSaveUrl(url):
    """Fetch *url*, extract each article under div#contentleft, and save it
    as a Markdown file (named after the sanitized article title) in localDir.

    Side effects: writes one .md file per content div and prints progress.
    Raises requests/IO exceptions to the caller, which logs failed URLs.
    """
    page = session.get(url, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')

    contents = soup.findAll('div', id='contentleft')
    for child in contents:
        item = child.findAll('h2')
        # .string is None for nested markup; fall back to "" so re.sub
        # does not crash on a None title.
        title = item[0].string or ""
        # Strip whitespace and (ASCII + full-width CJK) punctuation that is
        # illegal or awkward in file names.  Raw string avoids invalid
        # escape-sequence warnings.
        title = re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。?|？、~@#￥%……&*（）]+", "", title)
        filePath = localDir + title + ".md"

        # One write-mode open with explicit UTF-8 replaces the original
        # open('w') + repeated open('a') pattern, which leaked the first
        # file handle and relied on the platform default encoding.
        with open(filePath, 'w', encoding='utf-8') as f:
            print(filePath + "创建成功")

            # Markdown header followed by a fenced block for the body text.
            f.write('#  ')
            f.write(title + ' \n ')
            f.write('```xml \n ')

            # On this site the article body lives in font-size:18px spans.
            pvalues = child.findAll("span", attrs={'style': 'font-size:18px;'})
            for pvalue in pvalues:
                content = pvalue.string
                if content is not None:
                    f.write(content + '\n')
            f.write('``` \n ')
    print("完成 " + url)


# Entry point: read one URL per line from 网址.txt, fetch and save each
# article with a random pause between requests, and append any failed URL
# to errorLog.txt so it can be retried later.
fileDest = localDir + "网址.txt"
with open(fileDest, 'r') as urlFile:
    lines = urlFile.readlines()
total = len(lines)
for index, line in enumerate(lines, start=1):
    # Random 0-2 s pause between requests to avoid hammering the server.
    sleepTime = random.uniform(0, 2)
    time.sleep(sleepTime)
    print("进度%d/%d   ,  休眠%f秒" % (index, total, sleepTime))
    # Drop the trailing newline to get the bare URL.
    url = line.strip('\n')
    try:
        getAndSaveUrl(url)
    except Exception:
        # Best-effort: log the failed URL and continue with the rest.
        # (Exception, not bare except, so Ctrl-C still stops the run.)
        print("!!! 抓取失败 " + url)
        with open(localDir + "errorLog.txt", 'a', encoding='utf-8') as errFile:
            errFile.write(url + '\n')
















